What do I want?

In classifier_comparison I train and compare classifiers. Furthermore I've saved Random Forest (RF) scores into ../data/galaxy_images_training/*-dwarf_galaxy_scores.csv.

Now I'm just going to make some publication-grade plots of various metrics on the RF approach.

Code



In [1]:

    
# give access to importing dwarfz
import os, sys
dwarfz_package_dir = os.getcwd().split("dwarfz")[0]
if dwarfz_package_dir not in sys.path:
    sys.path.insert(0, dwarfz_package_dir)

import dwarfz
    
# back to regular import statements

%matplotlib inline
from matplotlib import pyplot as plt
import matplotlib as mpl

import seaborn as sns
sns.set(context="poster", style="ticks", 
#         font_scale=1.4,
       )

import numpy as np
import pandas as pd

from scipy.special import expit
import pathlib



In [2]:

    
mpl.rcParams['savefig.dpi'] = 80
mpl.rcParams['figure.dpi'] = 80
mpl.rcParams['figure.figsize'] = np.array((8,6))
mpl.rcParams['figure.facecolor'] = "white"



In [3]:

    
COSMOS_filename = pathlib.Path(dwarfz.data_dir_default) / "COSMOS_reference.sqlite"
COSMOS = dwarfz.datasets.COSMOS(COSMOS_filename)



In [4]:

    
HSC_filename = pathlib.Path(dwarfz.data_dir_default) / "HSC_COSMOS_median_forced.sqlite3"
HSC = dwarfz.datasets.HSC(HSC_filename)



In [5]:

    
matches_filename = pathlib.Path(dwarfz.data_dir_default) / "matches.sqlite3"
matches_df = dwarfz.matching.Matches.load_from_filename(matches_filename)



In [6]:

    
combined = matches_df[matches_df.match].copy()
combined["ra"]       = COSMOS.df.loc[combined.index].ra
combined["dec"]      = COSMOS.df.loc[combined.index].dec
combined["photo_z"]  = COSMOS.df.loc[combined.index].photo_z
combined["log_mass"] = COSMOS.df.loc[combined.index].mass_med

photometry_cols = [
    "gcmodel_flux","gcmodel_flux_err","gcmodel_flux_flags", "gcmodel_mag",
    "rcmodel_flux","rcmodel_flux_err","rcmodel_flux_flags", "rcmodel_mag",
    "icmodel_flux","icmodel_flux_err","icmodel_flux_flags", "icmodel_mag",
    "zcmodel_flux","zcmodel_flux_err","zcmodel_flux_flags", "zcmodel_mag",
    "ycmodel_flux","ycmodel_flux_err","ycmodel_flux_flags", "ycmodel_mag",
]

for col in photometry_cols:
    combined[col] = HSC.df.loc[combined.catalog_2_ids][col].values

Turn magnitudes into colors



In [7]:

    
combined["g_minus_r"] = combined.gcmodel_mag - combined.rcmodel_mag
combined["r_minus_i"] = combined.rcmodel_mag - combined.icmodel_mag
combined["i_minus_z"] = combined.icmodel_mag - combined.zcmodel_mag
combined["z_minus_y"] = combined.zcmodel_mag - combined.ycmodel_mag

Filter out bad data



In [8]:

    
mask =    np.isfinite(combined["g_minus_r"]) & np.isfinite(combined["r_minus_i"]) \
        & np.isfinite(combined["i_minus_z"]) & np.isfinite(combined["z_minus_y"]) \
        & np.isfinite(combined["icmodel_mag"]) \
        & (~combined.gcmodel_flux_flags) & (~combined.rcmodel_flux_flags) \
        & (~combined.icmodel_flux_flags) & (~combined.zcmodel_flux_flags) \
        & (~combined.ycmodel_flux_flags)

combined = combined[mask]

Create classification labels



In [9]:

    
low_z    = (combined.photo_z  < .15)
low_mass = (combined.log_mass > 8) & (combined.log_mass < 9)



In [10]:

    
combined["low_z_low_mass"] = (low_z & low_mass)
combined.low_z_low_mass.mean()









    Out[10]:





0.002185072976027315



In [11]:

    
combined.low_z_low_mass.sum()









    Out[11]:





727



In [12]:

    
combined.shape









    Out[12]:





(332712, 33)

Get Classifier Results



In [13]:

    
plots_dir = pathlib.Path("./plots/")
if not plots_dir.is_dir():
    plots_dir.mkdir()



In [14]:

    
features = combined.loc[:,["g_minus_r", "r_minus_i", "i_minus_z", "z_minus_y",
                           "icmodel_mag",
                           "photoz_best",
                           "photoz_risk_best" # The risk of photoz_best being outside of the range z_true +- 0.15(1+z_true). It ranges from 0 (safe) to 1(risky)
                          ]]

target = combined.loc[:,["low_z_low_mass"]]









    



/Users/egentry/anaconda3/lib/python3.7/site-packages/pandas/core/indexing.py:1472: FutureWarning: 
Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)



In [15]:

    
target.mean()









    Out[15]:





low_z_low_mass    0.002185
dtype: float64



In [16]:

    
COSMOS_field_area = 2 # sq. degree
N_COSMOS_total = HSC.df.shape[0]
N_COSMOS_good = combined.shape[0]

true_dwarf_density = target.sum().values[0] / COSMOS_field_area 
print("true dwarf density: {:.2f} / sq. deg.".format(true_dwarf_density))









    



true dwarf density: 363.50 / sq. deg.



In [17]:

    
filename = "../data/galaxy_images_training/2017_09_26-dwarf_galaxy_scores.csv"
df_dwarf_prob = pd.read_csv(filename,
#                             index_label="COSMOS_id",
                           )
df_dwarf_prob.head()









    Out[17]:







  
    
      
      COSMOS_id
      HSC_id
      dwarf_prob
      low_z_low_mass
    
  
  
    
      0
      628457
      43158322471244656
      0.000000
      False
    
    
      1
      919771
      43158605939114836
      0.000000
      False
    
    
      2
      444239
      43159142810013665
      0.000000
      False
    
    
      3
      569427
      43158734788125011
      0.063333
      False
    
    
      4
      369948
      43158863637144621
      0.606667
      True



In [18]:

    
COSMOS_field_area = 2 # sq. degree
N_COSMOS_total = HSC.df.shape[0]
N_COSMOS_good = combined.shape[0]

true_dwarf_density = target.sum().values[0] / COSMOS_field_area 
print("true dwarf density: {:.2f} / sq. deg.".format(true_dwarf_density))









    



true dwarf density: 363.50 / sq. deg.



In [19]:

    
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import roc_curve, roc_auc_score

pr_auc = average_precision_score(df_dwarf_prob.low_z_low_mass, df_dwarf_prob.dwarf_prob)
roc_auc = roc_auc_score(df_dwarf_prob.low_z_low_mass, df_dwarf_prob.dwarf_prob)

print("pr_auc: ", pr_auc)
print("roc_auc: ", roc_auc)









    



pr_auc:  0.46508277270666043
roc_auc:  0.9650265827334842



In [20]:

    
print("prior probability: ", df_dwarf_prob.low_z_low_mass.mean())









    



prior probability:  0.002185072976027315



In [21]:

    
threshold_probs = expit(np.linspace(-9, 6, num=100))

completeness = np.zeros_like(threshold_probs)
purity = np.zeros_like(threshold_probs)
true_positive_rate = np.zeros_like(threshold_probs)
false_positive_rate = np.zeros_like(threshold_probs)
sample_size_reduction = np.zeros_like(threshold_probs)
objects_per_sq_deg = np.zeros_like(threshold_probs)


true_dwarf = df_dwarf_prob.low_z_low_mass
true_non_dwarf = ~df_dwarf_prob.low_z_low_mass

for i, threshold_prob in enumerate(threshold_probs):
    target_prediction = (df_dwarf_prob.dwarf_prob > threshold_prob)

    prediction_dwarf     =  target_prediction
    prediction_non_dwarf = ~target_prediction

    completeness[i] = (true_dwarf & prediction_dwarf).sum() / true_dwarf.sum() 

    purity[i] = (true_dwarf & prediction_dwarf).sum() / prediction_dwarf.sum() 

    sample_size_reduction[i] = prediction_dwarf.size / prediction_dwarf.sum()
    objects_per_sq_deg[i] = prediction_dwarf.sum() / COSMOS_field_area

    
    true_positives  = np.sum(true_dwarf     & prediction_dwarf)
    false_positives = np.sum(true_non_dwarf & prediction_dwarf)

    true_negatives  = np.sum(true_non_dwarf & prediction_non_dwarf)
    false_negatives = np.sum(true_dwarf     & prediction_non_dwarf)

    true_positive_rate[i]  = true_positives  / true_dwarf.sum()
    false_positive_rate[i] = false_positives / true_non_dwarf.sum()









    



/Users/egentry/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:22: RuntimeWarning: invalid value encountered in long_scalars
/Users/egentry/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:24: RuntimeWarning: divide by zero encountered in long_scalars



In [22]:

    
i_threshold = np.argmin(objects_per_sq_deg > 1e3)

print("objects_per_sq_deg: ", objects_per_sq_deg[i_threshold])
print("threshold prob: ", threshold_probs[i_threshold])
print("sample_size_reduction: ", sample_size_reduction[i_threshold])
print("completeness: ", completeness[i_threshold])
print("purity: ", purity[i_threshold])









    



objects_per_sq_deg:  985.0
threshold prob:  0.07692720707958853
sample_size_reduction:  168.88934010152283
completeness:  0.7634112792297112
purity:  0.2817258883248731



In [23]:

    
color_RF = "g"



In [24]:

    
plt.plot([1, *false_positive_rate, 0],
         [1, *true_positive_rate, 0],
         label="Random Forest\n(AUC={:.3f})".format(roc_auc),
         color=color_RF
        )

plt.plot([0, 1], [0, 1],
         linestyle="dashed",
         color="black",
         )

# plt.xlim(left=0, right=1)
# plt.ylim(bottom=0, top=1)

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")

plt.legend(loc="best")

filename = pathlib.Path(plots_dir) / "curve_ROC"
plt.tight_layout()
plt.savefig(filename.with_suffix(".pdf"))
plt.savefig(filename.with_suffix(".png"))



In [25]:

    
plt.plot(completeness, 
         purity,
         label="Random Forest\n(AUC={:.3f})".format(pr_auc),
         color=color_RF,
        )

plt.xlabel("Recall")
plt.ylabel("Precision")

plt.legend(loc="best")

filename = pathlib.Path(plots_dir) / "curve_PR"
plt.tight_layout()
plt.savefig(filename.with_suffix(".pdf"))
plt.savefig(filename.with_suffix(".png"))



In [26]:

    
plt.plot(objects_per_sq_deg, completeness, color=color_RF)
plt.xlabel("Number of objects / deg$^2$")
plt.ylabel("completeness")
plt.axvline(1e3, linestyle="dashed", color="black")
plt.ylim(bottom=0, top=1)
plt.xscale("log")

filename = pathlib.Path(plots_dir) / "curve_completeness"
plt.tight_layout()
plt.savefig(filename.with_suffix(".pdf"))
plt.savefig(filename.with_suffix(".png"))



In [27]:

    
plt.plot(objects_per_sq_deg, purity, color=color_RF)
plt.xlabel("Number of objects / deg$^2$")
plt.ylabel("purity")
plt.axvline(1e3, linestyle="dashed", color="black")
plt.ylim(bottom=0, top=1.1)
plt.xscale("log")

filename = pathlib.Path(plots_dir) / "curve_purity"
plt.tight_layout()
plt.savefig(filename.with_suffix(".pdf"))
plt.savefig(filename.with_suffix(".png"))



In [ ]:

	COSMOS_id	HSC_id	dwarf_prob	low_z_low_mass
0	628457	43158322471244656	0.000000	False
1	919771	43158605939114836	0.000000	False
2	444239	43159142810013665	0.000000	False
3	569427	43158734788125011	0.063333	False
4	369948	43158863637144621	0.606667	True